AML Analytics Test


In [1]:
ls


AML Test.ipynb         Parallel_processing.py     install-apache.sh.txt
Beautifulsoup_Test.py  Scatterplot Sandbox.ipynb  mechanize.py
Import_csv.py          TestData.csv               scripts/
MRJob.py               fig1.png                   wakaripython/

In [2]:
#Data Mine
#http://youtu.be/p8hle-ni-DM
#http://youtu.be/eRpFC2CKvao?list=PLyBBc46Y6aAz54aOUgKXXyTcEmpMisAq3

#Data = TestData.csv or <'Dataset.csv'>
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from pylab import *

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

#Get Data
testdata=open('TestData.csv')

#Read csv
testdata=pd.read_csv('TestData.csv')

#View .head
testdata.head()


Out[2]:
   1  AccountNo_t   Amount_t                                   BBI_t  \
0  2    326001432  13,175.00                                     NaN   
1  3     18604455   2,584.60                                     NaN   
2  4     20038151  49,988.00  {6500}/ACC/ BOC NY LESS COMM USD 12.00   
3  5     20038151  49,988.00  {6500}/ACC/ BOC NY LESS COMM USD 12.00   
4  6   7680011476  10,068.00                                     NaN   

  BeneBank_t                        Beneficiary_t BNFADDR1_t BNFADDR2_t  \
0        NaN  ARCHINET DESIGNERS AND PLANNERS INC        NaN        NaN   
1        NaN               UNITED SHEETMETAL, INC        NaN        NaN   
2        NaN                          JINLONG HAN        NaN        NaN   
3        NaN                          JINLONG HAN        NaN        NaN   
4        NaN                        HSIN-HUA TENG        NaN        NaN   

  BNFADDR3_t     BNFID_t ... REFIMAD_t       Reference_t SenderABA_t  \
0        NaN   326001432 ...       NaN   USO130331000289    26014685   
1        NaN    18604455 ...       NaN    S063091044AC01    21000089   
2        NaN    20038151 ...       NaN  130401MS70060700    26003269   
3        NaN    20038151 ...       NaN  130401MS70056200    26003269   
4        NaN  7680011476 ...       NaN          2.01E+15    26009593   

         SenderName_t PaymtSource_t   Time_t  UserID_t  ValueDate_t  \
0  CHINA CONSTRUCTION           FLS  9:25:47     JANEZ  4/1/13 0:00   
1         CITIBANK NA           FLS  9:25:40     JIANY  4/1/13 0:00   
2                 NaN           FLS  9:25:32     JANEZ  4/1/13 0:00   
3                 NaN           FLS  9:25:23     JANEZ  4/1/13 0:00   
4         BK AMER NYC           FLS  9:25:16     JIANY  4/1/13 0:00   

                createdate  NA  
0  2013-04-01T09:25:00.00Z NaN  
1  2013-04-01T09:25:00.00Z NaN  
2  2013-04-01T09:25:00.00Z NaN  
3  2013-04-01T09:25:00.00Z NaN  
4  2013-04-01T09:25:00.00Z NaN  

[5 rows x 45 columns]

In [3]:
# Show the last five rows of the frame loaded above
testdata.tail(5)


Out[3]:
        1  AccountNo_t    Amount_t BBI_t BeneBank_t  \
994   996      8600821  379,412.62   NaN        NaN   
995   997   2000470238   61,460.53   NaN        NaN   
996   998   2000470238    4,385.00   NaN        NaN   
997   999     22610472    7,265.00   NaN        NaN   
998  1000     18302262  291,646.59   NaN        NaN   

                      Beneficiary_t                   BNFADDR1_t  \
994               SIG INTERNATIONAL                          NaN   
995       WEDGEWOOD ENTERPRISE CORP                          NaN   
996                   SCOTTRADE INC                          NaN   
997                U.S. TAMEX CORP.   12910 MULBERRY DR., UNIT A   
998  FARADAY TECHNOLOGY CORPORATION  3945 FREEDOM CIRCLE STE 200   

                BNFADDR2_t BNFADDR3_t     BNFID_t ... REFIMAD_t  \
994                    NaN        NaN     8600821 ...       NaN   
995                    NaN        NaN  3252100000 ...       NaN   
996                    NaN        NaN     8611343 ...       NaN   
997                WHITIER   90602 US    22610472 ...       NaN   
998  SANTA CLARA, CA 95054        NaN    18302262 ...       NaN   

          Reference_t SenderABA_t       SenderName_t PaymtSource_t    Time_t  \
994    S063107192CE01    21000089        CITIBANK NA           FLS  12:01:13   
995          2.01E+15   121100782            BK WEST           NaN  12:01:18   
996  GDGUS3336026724H    26014591  INDUSTRIAL & COMM           NaN  13:56:10   
997          2.01E+15    26009593        BK AMER NYC           NaN  13:58:04   
998    LCT31060661200    21000089        CITIBANK NA           FLS  13:58:26   

     UserID_t   ValueDate_t               createdate  NA  
994     JIANY  4/17/13 0:00  2013-04-17T12:01:00.00Z NaN  
995  ZHAOXIAC  4/17/13 0:00  2013-04-17T12:01:00.00Z NaN  
996     JIANY  4/16/13 0:00  2013-04-16T13:56:00.00Z NaN  
997     JIANY  4/16/13 0:00  2013-04-16T13:58:00.00Z NaN  
998     JIANY  4/16/13 0:00  2013-04-16T13:58:00.00Z NaN  

[5 rows x 45 columns]

Create DataFrame


In [4]:
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from pylab import *

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

#Dataframe
df = pd.DataFrame(data = testdata, columns=['AccountNo_t', 'Amount_t'])
df

#Use .head and .tail; or df.describe()
df.head()


Out[4]:
   AccountNo_t   Amount_t
0    326001432  13,175.00
1     18604455   2,584.60
2     20038151  49,988.00
3     20038151  49,988.00
4   7680011476  10,068.00

Create Simple Test Plot


In [5]:
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from pylab import *

#Load Data Source
testdata=pd.read_csv('TestData.csv')

#Build Dataframe
df = pd.DataFrame(data = testdata, columns=['AccountNo_t', 'Amount_t'])
df

#Create SimplePlot
figure()
df.plot()
xlabel('AccountNo_t')
ylabel('Amount')
title('AccountNo_t vs Amount')
show()


<matplotlib.figure.Figure at 0x7f83927db990>

Create plot with matplotlib object-oriented API


In [ ]:
from IPython.display import IFrame
#http://nbviewer.ipython.org/github/jrjohansson/scientific-python-lectures/blob/master/Lecture-4-Matplotlib.ipynb
# IFrame builds the <iframe> element (with quoted attributes) from the URL and
# size, replacing the hand-written HTML string with unquoted attribute values.
IFrame("http://nbviewer.ipython.org/github/jrjohansson/scientific-python-lectures/blob/master/Lecture-4-Matplotlib.ipynb", width=400, height=350)

In [2]:
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from pylab import *

#Load Data Source
testdata=pd.read_csv('TestData.csv')

#Build Dataframe
df = pd.DataFrame(data = testdata, columns=['AccountNo_t', 'Amount_t'])
df

#Create Plot with matplotlib object-oriented API

#Declare 'fig1' variable assign 'plt.figure()' value
fig1 = plt.figure(figsize=(8,4), dpi=200)

df.plot()
xlabel('AccountNo')
ylabel('Amount')
title('AccountNo vs Amount')
plt.show()

plt.iplot(df_to_iplot(df))

#Save fig
fig1.savefig("fig1.png", dpi=200)

Create Filter where 'Amount' <= 10000


In [1]:
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from pylab import *

#Read Data Source
testdata=pd.read_csv('TestData.csv')

#Build Dataframe
df = pd.DataFrame(data = testdata, columns=['AccountNo_t', 'Amount_t'])

#Create Filter
df[df['Amount_t']<='10000']


#Create Plot with matplotlib object-oriented API

#Declare 'fig1' variable assign 'plt.figure()' value
fig1 = plt.figure(figsize=(8,4), dpi=200)

df.plot()
xlabel('AccountNo')
ylabel('Amount')
title('AccountNo vs Amount')
plt.show()

#Save fig
fig1.savefig("fig1.png", dpi=200)


<matplotlib.figure.Figure at 0x7f1ec9c92990>

Declare DataFrame Variables


In [ ]:
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from pylab import *

#Read Data Source
testdata=pd.read_csv('TestData.csv')

#Build Dataframe
df = pd.DataFrame(data = testdata, columns=['AccountNo_t', 'Amount_t'])

#Create Filter
df[df['Amount_t']<='10000']

#Declare DataFrame Variables
df['Amount_t'].describe()
Amount = ('Amount_t')
Volume = count('AccountNo_t')
grouping -df.groupby['AccountNo_t']
x = 'Volume'
y = 'Amount'
                     
#Slicing
df.ix[0:, ['Volume', 'Amount']]


#Use .head and .tail
df.head()

In [3]:
#Search
# Return the matching rows rather than a True/False mask. The frame is named
# `testdata` in this notebook (`data` was never defined), and the account
# number is compared as a number because the column is displayed as integers.
# NOTE(review): assumes AccountNo_t is a numeric column - confirm.
testdata[testdata['AccountNo_t'] == 1020234]

Scatter Plot Test


In [1]:
#Scatter plot
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from pylab import *

#Read Data Source
testdata=pd.read_csv('TestData.csv')

#Build Dataframe
df = pd.DataFrame(data = testdata, columns=['AccountNo_t', 'Amount_t'])

#Declare 'fig2' variable assign 'plt.figure()' value
fig2 = plt.figure(figsize=(8,4), dpi=200)

#Create Random Scatter Plot
N = 50
x = np.random.rand(N)
y = np.random.rand(N)
colors = np.random.rand(N)
area = np.pi * (15 * np.random.rand(N))**2 # 0 to 15 point radiuses

plt.scatter(x, y, s=area, c=colors, alpha=0.5)
plt.show()

#Save fig
fig2.savefig("fig2.png", dpi=200)



In [1]:
#Scatter plot
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from pylab import *

#Load Data Source
testdata=pd.read_csv('TestData.csv')

#Build Dataframe
df = pd.DataFrame(data = testdata, columns=['AccountNo_t', 'Amount_t'])

# Plot outputs
pl.scatter(x, y, color='red')
pl.plot(x, regr.predict(x), color='blue', linewidth=3)

pl.xticks(())
pl.yticks(())

pl.show()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-1-fb072f9ac789> in <module>()
     17 
     18 # Plot outputs
---> 19 pl.scatter(x, y, color='red')
     20 pl.plot(x, regr.predict(x), color='blue', linewidth=3)
     21 

/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/matplotlib/pyplot.pyc in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, hold, **kwargs)
   3198         ret = ax.scatter(x, y, s=s, c=c, marker=marker, cmap=cmap, norm=norm,
   3199                          vmin=vmin, vmax=vmax, alpha=alpha,
-> 3200                          linewidths=linewidths, verts=verts, **kwargs)
   3201         draw_if_interactive()
   3202     finally:

/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/matplotlib/axes/_axes.pyc in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, **kwargs)
   3673             self.set_ymargin(0.05)
   3674 
-> 3675         self.add_collection(collection)
   3676         self.autoscale_view()
   3677 

/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/matplotlib/axes/_base.pyc in add_collection(self, collection, autolim)
   1457 
   1458         if autolim:
-> 1459             self.update_datalim(collection.get_datalim(self.transData))
   1460 
   1461         collection._remove_method = lambda h: self.collections.remove(h)

/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/matplotlib/collections.pyc in get_datalim(self, transData)
    187             transOffset = transOffset.get_affine()
    188 
--> 189         offsets = np.asanyarray(offsets, np.float_)
    190         if np.ma.isMaskedArray(offsets):
    191             offsets = offsets.filled(np.nan)

/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/numpy/core/numeric.pyc in asanyarray(a, dtype, order)
    512 
    513     """
--> 514     return array(a, dtype, copy=False, order=order, subok=True)
    515 
    516 def ascontiguousarray(a, dtype=None):

ValueError: could not convert string to float: AccountNo_t
/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/IPython/core/formatters.py:239: FormatterWarning: Exception in image/png formatter: could not convert string to float: AccountNo_t
  FormatterWarning,
<matplotlib.figure.Figure at 0x7fad7011f4d0>

Linear Regression


In [1]:
#http://nbviewer.ipython.org/gist/fonnesbeck/5850463

%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *
from sklearn import datasets, linear_model

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

#Linear Regression

#Load Data Source
testdata=pd.read_csv('TestData.csv')

#Declare 'fig3' variable assign 'plt.figure()' value
fig3 = plt.figure(figsize=(8,4), dpi=200)

# Use only one feature
#Build Dataframe
aml = pd.DataFrame(data = testdata, columns=['Amount_t'])

aml_x = aml[:, np.newaxis]
aml_x_temp = aml_x[:, :, 2]

# Split the data into training/testing sets
aml_x_train = aml_x_temp[:-20]
aml_x_test = aml_x_temp[-20:]

from sklearn.datasets.samples_generator import make_regression

# this is our test set, it's just a straight line with some
# gaussian noise
X, Y = make_regression(n_samples=100, n_features=1, n_informative=1,\
                        random_state=0, noise=35)


# Split the targets into training/testing sets
aml_y_train = aml.target[:-20]
aml_y_test = aml.target[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(aml_x_train, aml_x_train)

# The coefficients
print 'Coefficients: \n', regr.coef_
# The mean square error
print ("Residual sum of squares: %.2f" %
        np.mean((regr.predict(aml_x_test) - aml_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print ('Variance score: %.2f' % regr.score(aml_x_test, aml_y_test))


# output = CTR[Amount_t['Total'] < 9999.99]
#output.plot(kind='bar')

#Save fig
fig3.savefig("fig3.png", dpi=200)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-553389565a09> in <module>()
     24 aml = pd.DataFrame(data = testdata, columns=['Amount_t'])
     25 
---> 26 aml_x = aml[:, np.newaxis]
     27 aml_x_temp = aml_x[:, :, 2]
     28 

/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1778             return self._getitem_multilevel(key)
   1779         else:
-> 1780             return self._getitem_column(key)
   1781 
   1782     def _getitem_column(self, key):

/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1785         # get column
   1786         if self.columns.is_unique:
-> 1787             return self._get_item_cache(key)
   1788 
   1789         # duplicate columns & possible reduce dimensionaility

/opt/anaconda/envs/np18py27-1.9/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1054         """ return the cached item, item represents a label indexer """
   1055         cache = self._item_cache
-> 1056         res = cache.get(item)
   1057         if res is None:
   1058             values = self._data.get(item)

TypeError: unhashable type
<matplotlib.figure.Figure at 0x7f897540abd0>

Test Scatterplot and Distribution Workflow


In [3]:
#http://nbviewer.ipython.org/gist/fonnesbeck/5850463

%matplotlib inline
import pandas as pd
import pylab
import matplotlib.pyplot as plt
import numpy as np
from pandas import *

#Load Data Source
testdata=pd.read_csv('TestData.csv')

#Build Dataframe
df = pd.DataFrame(data = testdata, columns=['AccountNo_t', 'Amount_t'])

#Declare 'fig2' variable assign 'plt.figure()' value
fig4 = plt.figure(figsize=(8,4), dpi=200)

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

with mpl.rc_context(rc={'font.family': 'serif', 'font.weight': 'bold', 'font.size': 8}):
    fig = plt.figure(figsize=(6,3))
    ax1 = fig.add_subplot(121)
    ax1.set_xlabel('AccountNo')
    ax1.set_ylabel('Amount')
    ax1.set_title("Scatterplot")
    plt.plot(np.random.normal(size=100), np.random.normal(size=100), 'r.')
    ax2 = fig.add_subplot(122)
    plt.hist(np.random.normal(size=100), bins=15)
    ax2.set_xlabel('sample')
    ax2.set_ylabel('cumulative sum')
    ax2.set_title("Normal distrubution")
    plt.tight_layout()
    plt.savefig("fig4.png", dpi=200)

SOLR API


In [ ]:
# https://pypi.python.org/pypi/solrpy/
import solr

# Open a connection to the Solr server
s = solr.SolrConnection('http://50.255.26.173:8983/solr')

# Build one document and add it to the index, passing commit=True
doc = {
    'id': 1,
    'title': 'Lucene in Action',
    'author': ['Erik Hatcher', 'Otis Gospodnetić'],
}
s.add(doc, commit=True)

# Query on the title field and print the title of every hit
response = s.query('title:lucene')
for hit in response.results:
    print(hit['title'])

Common Logic Framework ISO/IEC 24707


In [ ]:
from IPython.display import IFrame
# http://sourceforge.net/projects/commonlogic/
# IFrame builds the <iframe> element (with quoted attributes) from the URL and
# size, replacing the hand-written HTML string with unquoted attribute values.
IFrame("http://sourceforge.net/projects/commonlogic/", width=400, height=350)

In [ ]:
from IPython.display import IFrame
# http://commonlogic.sourceforge.net/
# IFrame builds the <iframe> element (with quoted attributes) from the URL and
# size, replacing the hand-written HTML string with unquoted attribute values.
IFrame("http://commonlogic.sourceforge.net/", width=400, height=350)

Sandbox


In [ ]:
%matplotlib inline
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np
from pandas import *

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

plt.plot(np.random.normal(size=100), np.random.normal(size=100), 'ro')

Pandas + Plotly


In [ ]:
from IPython.display import IFrame
#http://nbviewer.ipython.org/gist/nipunreddevil/7734529
# IFrame builds the <iframe> element (with quoted attributes) from the URL and
# size, replacing the hand-written HTML string with unquoted attribute values.
IFrame("http://nbviewer.ipython.org/gist/nipunreddevil/7734529", width=400, height=350)